/******************************************************************************
 * This program processes the Odean Data for the SUE paper
 * Input: Dropbox/Odean/*.sas7bdat
 * Date Created: Aug 4, 2022; 
******************************************************************************/


libname odean "D:\Dropbox\Odean";


*** Individual level of trading upon earnings announcement [0, 1]; 

** Sample includes: investors who owns the stock and who newly bought the stock; 

/* How many accounts are IRA? Ans: 40% ; 

proc freq data = odean.base; 
	table account_type account_type_1 account_type_2 ;
	run; 
*/

* Generate permno-rdq1 pairs from 1991 to 1996; 


proc import datafile="D:\Dropbox\SASLib\Thesis\Stata\sue20220326.dta"
            out=dataout dbms = dta replace;
run;

data permno_rdq1; 
	set dataout; 
	if 1990<=year(rdq1)<=1997; 
	keep permno rdq1 fyearq fqtr county outdegree eigenvector infc; 
	keep abcret01_dgtw_r abcret295_dgtw_r; 
	keep turn_ab01 turn_ab295; 
	keep vom_ab01_pre_r vom_ab95_pre_r; 
	keep d_ret0 d_turn_ab_pre_5 d_vom_ab_pre_5; 
	keep logme logbm idvol ior urban sp500 retail ep2_winsor evol2_winsor
		nrdq_rk2 rptlag; 
	keep log_xad sp500 retail pop_density sind_p_ctex; 
  keep mean_age_ct retire_ratio_ct median_income_ctex mean_edu movedin_ctex ; 
  keep day qtr yr; 
  keep rdq_min; 
  keep sue:;  
  run; 
    
    proc sql; 
      create table permno_rdq1 as select a.*, b.number 
      from permno_rdq1 as a, rep.tdays_2018 as b 
      where a.rdq1 = b.date and a.permno ne . 
      order by permno, rdq1; quit; 
    ** Check how many trading days are between the current rdq1 and the next rdq1; 
    * In case there are missing reports for the previous or the next qtr, set these varaibels to be missing as well; 
    * First, start with next rdq1; 
    proc sort data = permno_rdq1; by permno descending fyearq descending fqtr descending rdq1; run; 
    data permno_rdq1;
      set permno_rdq1;
      by  permno descending fyearq descending fqtr descending rdq1;
      evttime_nrdq1 = lag(number) - number;
      next_rdq1 = lag(rdq1);
      if (fyearq - lag(fyearq))*4+ fqtr - lag(fqtr) ne -1 then do;
        evttime_nrdq1 = .;
        next_rdq1 = .;
      end; 
      if first.permno then do;
        evttime_nrdq1 = . ; 
        next_rdq1 = .;
      end; 
      format next_rdq1 date7.;
      run; 
    * Second, calcualte the lag rdq1;
    proc sort data = permno_rdq1; 
      by permno  fyearq  fqtr  rdq1; 
      run; 
    data permno_rdq1;
      set permno_rdq1;
      by  permno fyearq fqtr rdq1;
      evttime_lrdq1 = lag(number) - number;
      lag_rdq1 = lag(rdq1);
      if (fyearq - lag(fyearq))*4+ fqtr - lag(fqtr) ne 1 then do;
        evttime_lrdq1 = .;
        lag_rdq1 = .;
      end; 
      if first.permno then do;
        evttime_lrdq1 = . ; 
        lag_rdq1 = .;
      end; 
      format lag_rdq1 date7.;
      run; 


    proc sql;
      create table permno_rdq1
      as select distinct a.*, max(a.lag_rdq1, b.date) as day_41, 
        max(a.number+evttime_lrdq1, b.number) as number_41
      from permno_rdq1 as a left join rep.tdays_2018 as b 
      on a.number - b.number = 41; 


      create table permno_rdq1
      as select distinct a.*, max(a.lag_rdq1, b.date) as day_11,
        max(a.number+evttime_lrdq1, b.number) as number_11
      from permno_rdq1 as a left join rep.tdays_2018 as b 
      on a.number - b.number = 11; 

      create table permno_rdq1
      as select distinct a.*, b.date as day1, a.rdq1 as day0,
        b.number as number1, a.number as number0
      from permno_rdq1 as a left join rep.tdays_2018 as b 
      on a.number - b.number = -1; 

      create table permno_rdq1
      as select distinct a.*, b.date as day2, 
        b.number as number2
      from permno_rdq1 as a left join rep.tdays_2018 as b 
      on a.number - b.number = -2; 

      create table permno_rdq1
      as select distinct a.*, b.date as day61,
        b.number as number61
      from permno_rdq1 as a left join rep.tdays_2018 as b
      on 
        (not missing(a.evttime_nrdq1) and a.number+a.evttime_nrdq1 -5 =
          b.number) or
        (missing(a.evttime_nrdq1) and a.number + 61-5 = b.number)
      order by permno, rdq1; 

      quit;



** Cusip to Permno and aggregate position/trades to households; 

* First, cusip to permno;

    %let user = actg3705; 
    %let pswd = enivrI2013;

    %let wrds = wrds.wharton.upenn.edu 4016;
    options comamid=TCP remote=WRDS;
    signon username= &user  password= &pswd;

    rsubmit;

    proc download data = crsp.dsenames out =  dsenames; quit;

    endrsubmit;
    signoff;



* Second, match permno and aggregate position and trade to household level;  

  * For positions; 
  data position; 
    set odean.position91-odean.position96; 
    run ;

  proc sort data = position nodup; by account_number position_date cusip; run; 

  * Use historic ncusip; 
  proc sql;
    create table position  
    as select distinct a.*, b.permno 
    from position as a left join dsenames as b
    on substr(strip(a.cusip), 1, 8) = strip(b.ncusip) and 
       position_date between b.namedt and b.nameendt; 
    quit; 

  proc sql;
    create table position_hh as select distinct a.household_number, 
      b.position_date, b.cusip, b.permno, sum(b.position_equity) as
      position_equity, sum(b.position_quantity) as position_quantity
    from odean.base as a left join position as b
    on a.account_number = b.account_number
    group by a.household_number, b.position_date, b.cusip
    order by a.household_number, b.position_date, b.cusip; 
    quit; 

  * For trades; 
  proc sort data = odean.trade out = trade nodup; 
    by account_number trade_date cusip; 
    run;

  proc sql;
    create table trade  
    as select distinct a.*, b.permno 
    from trade as a left join dsenames as b
    on substr(strip(a.cusip), 1, 8) = strip(b.ncusip) and 
       trade_date between b.namedt and b.nameendt; 
    quit; 

  proc sql;
    create table trade_hh as select distinct a.household_number, 
      b.trade_date, b.cusip, b.permno, sum(b.quantity) as
      quantity, b.price, sum(b.principal) as principal,
      b.buy_sell, sum(b.trade_commission) as trade_commission
    from odean.base as a left join trade as b
    on a.account_number = b.account_number
    group by a.household_number, b.trade_date, b.cusip, b.buy_sell, b.price
    order by a.household_number, b.trade_date, b.cusip, b.buy_sell, b.price; 
    quit;

  proc sort data = trade_hh nodupkey; 
    by household_number trade_date cusip buy_sell quantity price; 
    run; 

  data trade_hh; 
    set trade_hh; 
    trade_id + 1; 
    run; 

  * Generate portfolio value;
  data position_hh; 
    set position_hh; 
    dummy_stock = .; 
    if not missing(permno) then dummy_stock = 1; 
    run; 
  proc sql;
    create table ptf_hh as select distinct 
      household_number, position_date as date, 
      sum(position_equity) as ptf, 
      sum(position_equity*dummy_stock) as stock_ptf
    from position_hh
    group by household_number, position_date; 
    quit; 

    * 62124 households with stock portfolio, 66465 from Barber and Odean(2000);
    proc sort data = ptf_hh out = check; by household_number
    descending stock_ptf; run;
    data check;
      set check(where = (not missing(stock_ptf)));
      by household_number descending stock_ptf; 
      if first.household_number; 
      run; 

* Third, create permno-rdq1-houshould-trade_date-trade pairs (trade being ided
  by price and buy_sell);

  * Add all households with positive equity portfolio at [0, 61]; 
  * This could blow up;
  * This indeed blew up, so we can only include smaller samples; 
  * i.e., those who help the shares or newly bought it;  
  /*
    proc sql;
      create table permno_rdq1_hh1 as 
      select distinct 
        a.*, b.household_number, b.stock_ptf, b.date as ptf_date
      from permno_rdq1 as a left join ptf_hh(where = (not missing(stock_ptf)))
      as b
      on intnx("month", a.rdq1, -1, "E") = intnx("month",b.date, 0, "E")
      order by a.permno, a.rdq1, a.fyearq, a.qtr,b.household_number,
        b.date;
      quit;
  */

  * All households with positive positions at the time of [0, 61];
  proc sql;
    create table permno_rdq1_hh as select distinct a.*, 
      b.household_number, b.position_equity, b.position_quantity,
      intnx("month", b.position_date, 0, "E") as position_date,
      "061" as hh_matched_window
    from permno_rdq1 as a left join position_hh as b 
    on a.permno  = b.permno and intnx("month", a.rdq1, -1, "E") <= intnx
    ("month",b.position_date, 0, "E") <= intnx("month", a.day61, 0, "E")
    order by a.permno, a.rdq1, a.fyearq, a.qtr,b.household_number,
      b.position_date;
    quit;

    * Take earliest position if multiple; 
    data permno_rdq1_hh; 
      set permno_rdq1_hh; 
      by permno rdq1 fyearq fqtr household_number position_date; 
      if first.household_number; 
      run;


  * All households with positive positions at the time of [-41,-11];
  proc sql;
    create table permno_rdq1_hh_pre as select distinct a.*, 
      b.household_number, b.position_equity, b.position_quantity,
      intnx("month", b.position_date, 0, "E") as position_date,
      "-41-11" as hh_matched_window
    from permno_rdq1 as a left join position_hh as b 
    on a.permno  = b.permno and intnx("month", a.day_41, -1, "E") <= intnx
    ("month",b.position_date, 0, "E") <=intnx("month", a.day_11, 0, "E")
    order by a.permno, a.rdq1, a.fyearq, a.qtr,b.household_number,
      b.position_date;
    quit;

    * Take earliest position if multiple; 
    data permno_rdq1_hh_pre; 
      set permno_rdq1_hh_pre; 
      by permno rdq1 fyearq fqtr household_number position_date; 
      if first.household_number; 
      run;



  * All trades for holders; 

  proc sql; 
    create table permno_rdq1_hh_td as select distinct  
      /*
      coalesce(a.permno, b.permno) as permno, 
      coalesce(a.household_number, b.household_number) as household_number, 
      */
      a.*, b.trade_id, b.trade_date, 
      b.quantity as trade_quantity, b.price as trade_price, 
      b.principal as trade_amount, b.buy_sell, b.trade_commission
    from permno_rdq1_hh as a  left join trade_hh
      (where=(not missing(permno))) as b
    on a.permno = b.permno and a.household_number = b.household_number
      and a.rdq1<=b.trade_date <= a.day61
    order by a.permno, a.rdq1, a.household_number, b.trade_date; 
    quit;

  proc sql; 
    create table permno_rdq1_hh_td_pre as select distinct  
      /*
      coalesce(a.permno, b.permno) as permno, 
      coalesce(a.household_number, b.household_number) as household_number, 
      */
      a.*, b.trade_id, b.trade_date, 
      b.quantity as trade_quantity, b.price as trade_price, 
      b.principal as trade_amount, b.buy_sell, b.trade_commission
    from permno_rdq1_hh_pre as a  left join trade_hh
      (where=(not missing(permno))) as b
    on a.permno = b.permno and a.household_number = b.household_number
      and a.day_41<=b.trade_date <= a.day_11
    order by a.permno, a.rdq1, a.household_number, b.trade_date; 
    quit;   

    * Flag new buys, trade on existing positions, and zero trade; 
    * There are several occasions: 
      - 1: position_date = rdq1's last month end
        -- A: if there is trade matched to householder_number, 
              then mark trade
        -- B: if there is no trade matche dto this householder, 
              then mark zero trade
      -2: position_date > rdq1's last month end
        --A: if there is trade matched to householder, 
              then mark as new buy order
        --B: if ther is no trade matched to householder, 
              then it means the position was build in the same month 
              as rdq1 but the trade date was before rdq1, 
              mark as zero trade; 
    * All can be easily inferred from the number of trades per household; 

    * Number of trades per household within [0,1], [2, 61], and [0, 61];
    data permno_rdq1_hh_td;
      set permno_rdq1_hh_td permno_rdq1_hh_td_pre;
      dummy_01 = .; 
      dummy_261 = .; 
      dummy_41_11 = .; 
      if rdq1<=trade_date<=day1 then dummy_01 = 1; 
      if day1<trade_date<=day61 then dummy_261 = 1; 
      if day_41<=trade_date<=day_11 then dummy_41_11 = 1; 
      run; 

    proc sql;
      create table permno_rdq1_hh_td as select distinct *, 
        count(distinct trade_id) as ntrades_4161,
        count(distinct trade_id*dummy_01) as ntrades_01,
        count(distinct trade_id*dummy_261) as ntrades_261,
        count(distinct trade_id*dummy_41_11) as ntrades_41_11
      from permno_rdq1_hh_td
      group by permno, rdq1, fyearq, fqtr, household_number 
      order by permno, rdq1, fyearq, fqtr, household_number, trade_date;   
      quit; 



** Household-Permno SCI measures; 

* First, household county;

  * Census 2000 county fips, given that infobase data are of 1997;
  * www2.census.gov/programs-surveys/popest/technical-documentation/methodology/1990-2000/90s-fips.txt;
  data WORK.County_fips2000;
    %let _EFIERR_ = 0; /* set the ERROR detection macro variable */
    infile 'D:\Dropbox\Odean\county_fips2000.csv' delimiter = ',' MISSOVER DSD
    lrecl=32767 firstobs=2 ;
       informat FIPS best32. ;
       informat name $50. ;
       format FIPS best12. ;
       format name $50. ;
    input
                FIPS
                name $
    ;
    if _ERROR_ then call symputx('_EFIERR_',1);  /* set ERROR detection macro variable */
    run;

    * drop states and only keep counties; 
    proc sql;
      create table county_fips2000_1 as select fips, name as county_name
      from county_fips2000
      where mod(fips, 1000) ne 0;  
      quit;

    * Drop "COUNTY", "BOROUGH", and "CENSUS AREA" from county names; 
    data county_fips2000_1; 
      set county_fips2000_1;
      county_name = transtrn(county_name," County", trim('')); 
      county_name = transtrn(county_name," Borough", trim('')); 
      county_name = transtrn(county_name," Census Area", trim('')); 
      county_name = transtrn(county_name," Parish", trim('')); 
      county_name = transtrn(county_name,"St. ", trim('Saint ')); 
      county_name = transtrn(county_name,"St  ", trim('Saint ')); 
      county_name = strip(upcase(county_name));
      run;
    * Add state name; 
    proc sql;
      create table county_fips2000_1 as select distinct a.*, b.state
        as statefips, b.statecode
      from county_fips2000_1 as a, sashelp.zipcode as b 
      where floor(a.fips/1000) = b.state; 
      quit; 


  * Circa 2000 zip mapping file;
  * https://mcdc.missouri.edu/cgi-bin/broker?_PROGRAM=utils.uex2dex.sas&path=/data/georef&dset=zipcodes&view=0; 
  proc sql;
    create table zip_cntyname as select distinct a.cntyname, a.stab, 
      b.state as statefips 
    from odean.zipcode as a, sashelp.zipcode as b  
    where upcase(strip(a.stab)) = upcase(strip(b.statecode)) and not missing
      (a.cntyname)
    order by b.state, a.cntyname ;
    quit;

  proc sql;
    create table zip_cntyname as select distinct a.*, b.fips as county_fips,
      b.county_name
    from zip_cntyname as a left join
      county_fips2000_1 as b
    on a.statefips = b.statefips and 
      upcase(strip(compress(a.cntyname, "","ka"))) = upcase(strip(compress
        (b.county_name, "", "ka")))
    order by a.statefips, a.cntyname; 
    quit; 


    * Some manual work to be down to make it complete; 

    data WORK.ZIP_CNTYNAME1    ;
      %let _EFIERR_ = 0; /* set the ERROR detection macro variable */
      infile 'D:\Dropbox\Odean\zip_cntyname_manual.csv' delimiter = ',' MISSOVER DSD
          lrecl=32767 firstobs=2 ;
         informat CntyName $50. ;
         informat Stab $2. ;
         informat statefips best32. ;
         informat county_fips best32. ;
         format CntyName $50. ;
         format Stab $2. ;
         format statefips best12. ;
         format county_fips best12. ;
      input
                  CntyName $
                  Stab $
                  statefips
                  county_fips
      ;
      if _ERROR_ then call symputx('_EFIERR_',1);  /* set ERROR detection macro variable */
      run;

  * county_fips to 2000 zip file; 

  proc sql;
    create table zip_county as select distinct a.zip, b.county_fips, a.cntyname,
      a.stab
    from odean.zipcode as a left join ZIP_CNTYNAME1 as b 
    on a.CntyName = b.CntyName and a.stab = b.stab; 
    quit; 

    data check ;
      set zip_county; 
      if missing(county_fips); 
      run; 

    proc sql;
      create table check1 as select distinct a.zip, 1000*b.state+b.county 
        as county_fips, b.countynm as cntyname, a.stab
      from check as a, sashelp.zipcode as b 
      where input(a.zip, 32.) = b.zip ; 
      quit;  

    data zip_county; 
      set zip_county(where = (not missing(county_fips)))
          check1; 
      run; 

    * save the file;

    data odean.zip_county_2000;
      set zip_county;
      run;


  * Now, match county_fips to households; 

  proc sql;
    create table infobase as select distinct
      input(a.household_account, 32.) as household_number,
      a.zip, a.match_indicator, a.martial_status as martial_status,
      a.HH_gender,
      a.hh_age2yr_incr as hh_age, a.income_code, b.county_fips
    from odean.infobase as a left join zip_county as b
    on input(a.zip, 32.) = input(b.zip, 32.);
    quit;
    /* Roughly 5% cannot be matched */ 

    /*
    * Use sashelp.zipcode for unmatched zip codes: zero obs can be matched; 
    proc sql;
      create table inforbase1 as select distinct 
        a.household_number, a.zip, a.match_indicator, a.martial_status,
        a.hh_gender, a.hh_age, a.income_code,
        1000*b.state+b.county as county_fips 
      from infobase (where = (missing(county_fips))) as a
        ,sashelp.zipcode as b 
      where input(a.zip, 32.) = b.zip; 
      quit;  
    */

  proc sort data = infobase; by household_number county_fips; run; 
  data infobase; set infobase; by household_number county_fips; 
    if first.household_number;
    run; 

  * Merge back to main sample; 

  proc sql;
    create table main1 as select distinct a.*, b.*
    from permno_rdq1_hh_td as a left join infobase as b 
    on a.household_number = b.household_number; 
    quit; 

    /* 62% trades can be geocoded; 
    data check;
      set main1; 
      if missing(county_fips) and not missing(household_number);
      run; * Roughly one thirds are missing, since infobase only covers
      roughly two thirds households as well;  
    */

* Second, now household-permno sci measures;

  * Facebook SCI data and population;  

  PROC IMPORT OUT= WORK.facebook
              DATAFILE= "D:\Dropbox\facebook\friendship link data (external)\SCI Data\County_SCI.csv" 
              DBMS=CSV REPLACE;
       GETNAMES=YES;
       DATAROW=2; 
  RUN;

    * ID counties from 1 to 3200;
    proc sort data = facebook (keep = own_county) out = id nodupkey; by own_county; run;
    data id; 
      set id; 
      id +1 ;
      run ;
    proc sql;
      create table fb0
      as select a.*, b.id as from
      from facebook as a, id as b
      where a.own_county = b.own_county;

      create table fb1
      as select a.*, b.id as to
      from fb0 as a, id as b
      where a.friend_county = b.own_county;
      quit; 


    * Add population;
    data pot; 
      retain fips pop_ct land; 
      set rep.county_ctrl_2018;
      if year = 2016;
      keep fips pop_ct land; 
      run; 
    proc sql;
      create table fb2
      as select a.*, b.pop_ct as own_county_pop
      from fb1 as a, pot as b
      where a.own_county = b.fips;

      create table fb2
      as select a.*, b.pop_ct as friend_county_pop
      from fb2 as a, pot as b
      where a.friend_county = b.fips;
      quit; 

    * Add population weigthed location coordinates; 

    PROC IMPORT OUT= WORK.county_loc
                DATAFILE= "D:\Dropbox\SASLib\Thesis\County_locations.csv" 
                DBMS=CSV REPLACE;
         GETNAMES=Yes;
         DATAROW=2; 
    RUN;

    * Match lat and lon to home county and calcualte pairwise distances; 
    proc sql;
      create table fb2
      as select a.*, b.INTPTLAT as own_county_lat, b.INTPTLONG as own_county_lon
      from fb2 as a, county_loc as b 
      where a.own_county = geoid; 
      quit; 

    proc sql;
      create table fb2
      as select a.*, b.INTPTLAT as friend_county_lat, b.INTPTLONG as friend_county_lon
      from fb2 as a, county_loc as b
      where a.friend_county=b.geoid; 
      quit;

    * Outdegree of own county; 
    proc sql;
      create table fb2
      as select distinct *, sum(sci) as outdegree
      from fb2
      group by own_county; 
      quit; 


/********************************************************************************
                      Extrapolate portfolio position data 
********************************************************************************/
* Cause: there exists entended period of time when trades are recorded but not 
  portfolio positions; 
* The purpose of the section is to extend the postion data by backing out based on 
  trades; 

* First, are there any gaps in position data? ; 
  
  proc sql;
    create table time1 as select distinct
      household_number, 
      min(intnx("month", date,0, "E")) as begdt, 
      max(intnx("month", date,0, "E")) as enddt
    from ptf_hh
    group by household_number
    order by household_number; 
    quit; 

  data time1; 
    set time1; 
    date = begdt; 
    output;  
    do while(date<enddt);
      date = intnx("month", date, 1, "E"); 
      output; 
    end; 
    format date date9.; 
    format begdt date9.; 
    format enddt date9.; 
    run;

  proc sort data = time1 nodupkey; by household_number date; run ;

  data ptf_hh1(rename = (date_day = date)); 
    set ptf_hh; 
    date_day = intnx("month", date, 0, "E");
    drop date;
    format date_day  date9.;  
    run ;

  data time2; 
    merge time1(in =a) ptf_hh1(in= b); 
    by household_number date; 
    in1 = a; 
    in2 = b; 
    keep household_number date begdt enddt in1 in2;
    run;

  data check ;
    set time2; 
    if in1 = 1 and in2 ne 1; 
    run; * 1/3 are missing, that's a lot!!!!!; 

* Second, span time range with trade data and fill in; 

  * Time range based on holding; 

  proc sql;
    create table time1 as select distinct
      household_number, 
      min(intnx("month", date,0, "E")) as begdt
      format = date9., 
      max(intnx("month", date,0, "E")) as enddt
      format = date9.
    from ptf_hh 
    where not missing(date)
    group by household_number
    order by household_number; 
    quit; 

  * Time range based on trading; 

  proc sql;
    create table time2 as select distinct
      household_number, 
      min(intnx("month", trade_date,0, "E")) as begdt_t 
      format = date9., 
      max(intnx("month", trade_date,0, "E")) as enddt_t
      format = date9.
    from trade_hh
    where not missing(trade_date)
    group by household_number
    order by household_number; 
    quit; 

  * Combine two time ranges; 

  data time3; 
    merge time1 (in = a) time2(in = b); 
    by household_number; 
    in1 = a; 
    in2 = b; 
    run;

    * 36,645 households have more extensive trading time range; 

    data check ;
      set time3; 
      if enddt_t>enddt or begdt_t<begdt and 
         cmiss(enddt, enddt_t, begdt, begdt_t) = 0 ; 
      run ; 

  * Expand time series and mark missing month for positions; 

  data time4; 
    set time3; 
    date = min(begdt, begdt_t);
    output;  
    do while(date<max(enddt, enddt_t));
      date = intnx("month", date, 1, "E"); 
      output; 
    end; 
    format date date9.; 
    format begdt date9.; 
    format enddt date9.; 
    format begdt_t date9.; 
    format enddt_t date9.; 
    run;

  data time4; 
    merge time4 ptf_hh1;
    by household_number date; 
    if missing(ptf) and date> begdt then miss = 1; 
    if missing(ptf) and date<= begdt then miss = 2; 
    run ;
    * 1 if missing after ptf end date; 
    * 2 if missing in between ptf range; 


* Third, extrapolate positions; 
  
  * Time range for each asset in the ptf; 

    data position_hh; 
      set position_hh; 
      cusip8 = substr(strip(cusip), 1, 8); 
      run; 
    data trade_hh; 
      set trade_hh; 
      cusip8 = substr(strip(cusip), 1, 8); 
      date = intnx("month", trade_date, 0, "E"); 
      run; 
    proc sql;
      create table cusip_range as select distinct 
        household_number, cusip8 as cusip,
        min(intnx("month", position_date,0, "E")) as cusip_begdt
        format = date9., 
        max(intnx("month", position_date,0, "E")) as cusip_enddt
        format = date9.
      from position_hh
      where not missing(cusip)
      group by household_number, cusip8
      order by household_number, cusip8; 
      quit; 
    proc sql;
      create table cusip_range_t as select distinct 
        household_number, cusip8 as cusip,
        min(intnx("month", trade_date,0, "E")) as cusip_begdt_t
        format = date9., 
        max(intnx("month", trade_date,0, "E")) as cusip_enddt_t
        format = date9.
      from trade_hh
      where not missing(cusip)
      group by household_number, cusip8
      order by household_number, cusip8; 
      quit;
    proc sql;
      create table cusip_range_all as select distinct
        coalesce(a.household_number, b.household_number) as household_number, 
        coalesce(substr(strip(a.cusip), 1,8), substr(strip(b.cusip), 1,8)) as cusip, 
        a.cusip_begdt, a.cusip_enddt, b.cusip_enddt_t, b.cusip_begdt_t
      from cusip_range as a full outer join cusip_range_t as b 
      on a.household_number = b.household_number and 
         substr(strip(a.cusip), 1, 8) =substr(strip(b.cusip), 1, 8); 
      quit; 
    proc sort data = cusip_range_all nodupkey; by household_number cusip; run ;

  * Populate household-cusip-time(in max time range) triplets; 

    proc sql;
      create table time5 as select distinct 
        a.*, b.*
      from time4 as a left join cusip_range_all as b 
      on a.household_number = b.household_number     
      order by a.household_number, b.cusip, a.date; 
      quit; 
    proc sort data = time5 nodupkey; by household_number cusip date; run ;

  * Position data; 

    * Ocasionally, more than one positions per cusip8 in the same month, aggregate; 

    proc sql;
      create table position_hh_agg as select distinct
        household_number, cusip8 as cusip,
        position_date, 
        sum(position_quantity) as position_quantity, 
        sum(position_equity) as postion_equity
      from position_hh
      group by household_number, cusip8, position_date 
      order by household_number, cusip8, position_date;
      quit;

    proc sql;
      create table time5 as select distinct 
        a.*, b.position_equity, b.position_quantity
      from time5 as a left join position_hh as b 
      on a.household_number = b.household_number and 
         substr(strip(a.cusip), 1, 8) =substr(strip(b.cusip), 1, 8) and 
         intnx("month", a.date, 0, "E") = intnx("month", b.position_date, 0, "E")
      order by a.household_number, a.cusip, a.date; 
      quit; 
    proc sort data = time5; by household_number cusip date position_quantity; run ;
    data time5; set time5; by household_number cusip date position_quantity; 
      if last.date; run; 

  * Same-month trade data;

    * Ocasionally, more than one trades per cusip in the same month, aggregate; 

    proc sql;
      create table trade_hh_agg as select distinct
        household_number, cusip8 as cusip, permno, 
        sum(quantity) as net_trade, 
        sum(principal) as net_trade_amount, 
        mean(price) as trade_price_avg, 
        trade_date, date
      from trade_hh
      group by household_number, cusip8, date 
      order by household_number, cusip8, date, trade_date ; 
      quit;

    * Keep the latest trade date; 

    data trade_hh_agg; 
      set trade_hh_agg; 
      by household_number cusip date trade_date; 
      if last.date; 
      run; 

    proc sql;
      create table time5 as select distinct 
        a.*, b.quantity as trade_quantity, 
        b.principal as trade_amount, 
        b.price as trade_price
      from time5 as a left join trade_hh as b 
      on a.household_number = b.household_number and 
         substr(strip(a.cusip), 1, 8) =substr(strip(b.cusip), 1, 8) and 
         intnx("month", a.date, 0, "E") = intnx("month", b.trade_date, 0, "E")
      order by a.household_number, a.cusip, a.date; 
      quit;

    proc sort data = time5; by household_number cusip date trade_quantity; run ;
    data time5; set time5; by household_number cusip date trade_quantity; 
      if last.date; run; 


  * Add position cusip price; 
  * Three step of matching; 

    * 1. Exact date match from odean price file;

      * From trades and position files; 

        proc sql;
          create table position_price as select distinct
            cusip8 as cusip, 
            intnx("month", position_date, 0, "E") as date format = date9., 
            position_equity/position_quantity as price
          from position_hh
          where not missing(cusip8)
          order by cusip8, date;
          quit;

        proc sql;
          create table trade_price as select distinct
            cusip8 as cusip, 
            trade_date as date, 
            price
          from trade_hh
          where not missing(cusip8)
          order by cusip8, trade_date;
          quit;

        data price; 
          merge position_price trade_price; 
          by cusip date; 
          run;

      * Take average if multiple prices on a given day; 

        proc means data = price noprint; 
          by cusip date; 
          var price; 
          output out=price1(drop = _type_ _freq_) mean=price;
          run; 

      * proc sql;
      *   create table time6 as select distinct 
      *     a.*, b.price as price1
      *   from time5 as a left join price as b 
      *   on substr(strip(a.cusip), 1, 8) =substr(strip(b.cusip), 1, 8) and 
      *      intnx("month", a.date, 0, "E") = b.date ;
      *   quit; 

    * 2. From crsp DSF file;

      libname crsp "E:\CRSP";

      * Last day of a month may not be a trading day;
      * Make sure to use historical ncusip instead of cusip; 

      data crsp_d; 
        set crsp.crsp_d(rename = date = date1 drop = cusip); 
        prc = abs(prc); 
        date= intnx("month", date1, 0, "E"); 
        rename ncusip = cusip;  
        keep permno ncusip prc date date1; 
        run;
      proc sort data = crsp_d; by cusip date date1; run;
      data crsp_d; 
        set crsp_d; 
        by cusip date date1; 
        retain _x; 
        if first.cusip or prc ne . then _x = prc; 
        if not first.cusip or prc = . then prc = _x; 
        if last.date then output; 
        run; 

      * data time6(drop = _x);
      *   merge time5(where=(not missing(cusip)) in = a) crsp_d(in= b) price1(in = c); 
      *   by cusip date; 
      *   if a and (b or c) ; 
      *   run; 

      * proc sql;
      *   create table time6 as select distinct 
      *     a.*, b.price as price1
      *   from time6 as a left join price as b 
      *   on substr(strip(a.cusip), 1, 8) =substr(strip(b.cusip), 1, 8) and 
      *      intnx("month", a.date, 0, "E") = b.date;
      *   quit;

    * 3. Match to nearest if still missing, extrapolate from 


      * Combine cusip-date pairs from position, trade, and max range; 

      proc sql;
        create table cusip_date0 as select distinct 
          cusip, date
        from time5
        order by cusip, date; 
        quit;

      data cusip_date;
        set cusip_date0 price1; 
        run;

      proc sort data = cusip_date; by cusip date price; run; 
      data cusip_date; 
        set cusip_date; 
        by cusip date price; 
        if last.date;
        if not missing(cusip); 
        if not missing(date);  
        run;

      * Fill in last non-missing; 

      data cusip_date(drop = _x); 
        set cusip_date; 
        by cusip date; 
        retain _x; 
        if first.cusip or price ne . then _x = price; 
        if not first.cusip or price = . then price = _x; 
        if last.date then output; 
        run; 

      * Fill in the next non-missing (for those missing a section in the begining); 

      proc sort data = cusip_date; by cusip descending date; run; 

      data cusip_date(drop = _x); 
        set cusip_date; 
        by cusip descending date; 
        retain _x; 
        if first.cusip or price ne . then _x = price; 
        if not first.cusip or price = . then price = _x; 
        if last.date then output; 
        run; 

    * Combine 1-3; 

      proc sort data = time5; by cusip date; run;
      proc sort data = crsp_d; by cusip date; run;
      proc sort data = cusip_date; by cusip date; run;


      data time6; 
        merge time5(in = a) crsp_d(in = b) cusip_date(in = c); 
        by cusip date;  
        if a;
        price_final = coalesce(position_equity/position_quantity, prc, price); 
        drop _x;  
        run;


  * For missing position_equity and position_quantity, infer using the lagged 
    position data the same-month trade data; 

    proc sort data = time6; by household_number cusip date; run ;
    data time6(drop = _x); 
      set time6; 
      by household_number cusip date; 
      position_quantity_bk = position_quantity;
      retain _x; 
      if first.cusip and position_quantity = . then position_quantity= 0; 
      if first.cusip or position_quantity ne . then _x = position_quantity;
      if not first.cusip and position_quantity = . then do; 
        position_quantity = sum(_x, trade_quantity);  
      end; 
      if first.cusip or position_quantity ne . then _x = position_quantity;
      run; 

    * Any negative positions?;
    * Yes, around 3%, due to initial positions not being recorded for subsequent sales; 
    /*data check; set time6; if position_quantity <0; run;*/

  * Recreate position_hh data; 

    data position_hh_extended; 
      set time6; 
      position_quantity_final = max(0, position_quantity); 
      position_equity_final = coalesce(position_equity, position_quantity_final*price_final); 
      keep household_number cusip date position_equity_final price_final position_quantity_final; 
      if position_quantity_final >0; 
      run;

    * Add permno; 

    proc sql;
      create table position_hh_extended as select distinct 
        a.*, b.permno
      from position_hh_extended as a left join dsenames as b 
      on substr(strip(a.cusip), 1, 8) = strip(b.cusip) and 
         a.date between b.namedt and b.nameendt; 
      quit; 

    * Merge with postion_hh; 

    data position_hh; 
      set position_hh; 
      cusip = substr(strip(cusip), 1, 8);
      date = intnx("month", position_date, 0, "E");  
      run;

    proc sort data = position_hh_extended; by household_number date cusip; run; 
    proc sort data = position_hh; by household_number date cusip; run; 

    data position_hh_final; 
      merge position_hh_extended(keep = household_number cusip date position_quantity_final 
            price_final position_equity_final permno
            where = (not missing(cusip) and cusip ne "missings"))
            position_hh;      
      by household_number date cusip;
      price = coalesce(position_equity/position_quantity, price_final); 
      position_quantity = coalesce(position_quantity, position_quantity_final); 
      position_equity = coalesce(position_equity, position_equity_final); 
      keep household_number date cusip permno price position_quantity position_equity; 
      run;

  * Recreate ptf_hh data;

    proc sql;
      create table ptf_hh_extended as select distinct 
        household_number, date, 
        sum(position_equity) as ptf_extended, 
        sum(position_equity*(not missing(permno))) as stock_ptf_extended
      from position_hh_final
      group by household_number, date; 
      quit; 

    data ptf_hh; set ptf_hh; date = intnx("month", date, 0, "E"); run ; 
    proc sort data = ptf_hh; by household_number date; run; 

    data ptf_hh_final; 
      merge ptf_hh_extended ptf_hh; 
      by household_number date; 
      ptf = coalesce(ptf, ptf_extended); 
      stock_ptf = coalesce(stock_ptf, stock_ptf_extended); 
      keep household_number date ptf stock_ptf; 
      run; 

/********************************************************************************
                 * Reconstruct household level tests; 
********************************************************************************/

** Baseline permno-rdq1-household tests;
* Construct several baseline samples: 
  1. All households holding/traded the stock before rdq1
  2  All households holding the stock during [-41, -11]
  3. All households holding the stock in the past 12 month before rdq1
  4. All actively trading households based on number of trades
  5. All households who traded or owned peer stocks;


  * 1. All households holding the stock before rdq1; 

    * Holded; 
    proc sql;
      create table sample1_a as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as holding_rdq1
      from permno_rdq1 as a, position_hh_final as b 
      where a.permno = b.permno and intnx("month", a.rdq1, -1, "E")
        = intnx("month", b.date, 0, "E") 
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit; 

    * Traded;
    proc sql;
      create table sample1_b as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as trading_rdq1
      from permno_rdq1 as a, trade_hh as b 
      where a.permno = b.permno and intnx("month", a.rdq1, -2, "E")
        < trade_date <= intnx("month", a.rdq1, -1, "E")
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit; 

    * Combined; 
    data sample1; 
      merge sample1_a sample1_b; 
      by permno rdq1 fyearq fqtr household_number; 
      run ;

    proc sql;
      create table sample1 as select distinct 
        permno, rdq1, fyearq, fqtr,
        household_number, 
        sum(holding_rdq1) as holding_rdq1, 
        sum(trading_rdq1) as trading_rdq1
      from sample1
      group by permno, rdq1, fyearq, fqtr, household_number
      order by permno, rdq1, fyearq, fqtr, household_number; 
      quit; 

  * 2. All households holding/traded the stock during [-41, -11]; 

    proc sql;
      create table sample2_a as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as holding_pre
      from permno_rdq1 as a, position_hh_final as b 
      where a.permno  = b.permno and intnx("month", a.day_41, -1, "E") <= intnx
      ("month",b.date, 0, "E") <=intnx("month", a.day_11, 0, "E")
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit;

    * Traded;
    proc sql;
      create table sample2_b as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as trading_pre
      from permno_rdq1 as a, trade_hh as b 
      where a.permno = b.permno and a.day_41 <= b.trade_date<=a.day_11
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit; 

    * Combined; 
    data sample2; 
      merge sample2_a sample2_b; 
      by permno rdq1 fyearq fqtr household_number; 
      run ;

    proc sql;
      create table sample2 as select distinct 
        permno, rdq1, fyearq, fqtr,
        household_number, 
        sum(holding_pre) as holding_pre,
        sum(trading_pre) as trading_pre
      from sample2
      group by permno, rdq1, fyearq, fqtr, household_number
      order by permno, rdq1, fyearq, fqtr, household_number; 
      quit; 

  * 3. All householdes holding/trading the stock in the past 12 month; 

    proc sql;
      create table sample3_a as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as holding_12m
      from permno_rdq1 as a, position_hh_final as b 
      where a.permno  = b.permno and intnx("month", a.rdq1, -11, "S") <= intnx
      ("month",b.date, 0, "E") <=intnx("month", a.rdq1, 0, "S")
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit;

    * Traded;
    proc sql;
      create table sample3_b as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as trading_12m
      from permno_rdq1 as a, trade_hh as b 
      where a.permno  = b.permno and intnx("month", a.rdq1, -11, "S") <
       b.trade_date <=intnx("month", a.rdq1, 0, "S")
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit; 

    * Combined; 
    data sample3; 
      merge sample3_a sample3_b; 
      by permno rdq1 fyearq fqtr household_number; 
      run ;

    proc sql;
      create table sample3 as select distinct 
        permno, rdq1, fyearq, fqtr,
        household_number, 
        sum(holding_12m) as holding_12m, 
        sum(trading_12m) as trading_12m
      from sample3
      group by permno, rdq1, fyearq, fqtr, household_number
      order by permno, rdq1, fyearq, fqtr, household_number; 
      quit; 

  * 3.1 All householdes holding/trading the stock in the past 36 month; 

    proc sql;
      create table sample3_a as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as holding_36m
      from permno_rdq1 as a, position_hh_final as b 
      where a.permno  = b.permno and intnx("month", a.rdq1, -35, "S") <= intnx
      ("month",b.date, 0, "E") <=intnx("month", a.rdq1, 0, "S")
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit;

    * Traded;
    proc sql;
      create table sample3_b as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr,
        b.household_number, 
        1 as trading_36m
      from permno_rdq1 as a, trade_hh as b 
      where a.permno  = b.permno and intnx("month", a.rdq1, -36, "S") <
       b.trade_date <=intnx("month", a.rdq1, 0, "S")
      order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
      quit; 

    * Combined; 
    data sample31; 
      merge sample3_a sample3_b; 
      by permno rdq1 fyearq fqtr household_number; 
      run ;

    proc sql;
      create table sample31 as select distinct 
        permno, rdq1, fyearq, fqtr,
        household_number, 
        sum(holding_36m) as holding_36m, 
        sum(trading_36m) as trading_36m
      from sample31
      group by permno, rdq1, fyearq, fqtr, household_number
      order by permno, rdq1, fyearq, fqtr, household_number; 
      quit; 

  * 4. All actively trading households based on number of trades; 

    * Summary stat on trading frequency; 

      * hh month pairs; 
      data trade_hh; 
        set trade_hh;
        year = year(trade_date);
        dummy_stock = .; 
        if not missing(permno) then dummy_stock =1;  
        run;

      proc sql;
        create table hh_time as select distinct
          household_number, household_open_date
        from odean.base 
        order by household_number, household_open_date; 
        quit; 

      * Only starts filling in from account opening date; 
      data hh_time; 
        set hh_time; 
        date = max(mdy(01, 31, 1991), intnx("month", household_open_date, 0, "E"));
        output; 
        do while (date<mdy(12,31,1996)); 
          date = intnx("month", date, 1, "E"); 
          output; 
        end; 
        format date date9.; 
        run; 

      * Link trades to month; 
      proc sql;
        create table trade_freq as select distinct 
          a.*, 
          count(distinct b.trade_id*b.dummy_stock) as ntrades_stock, 
          count(distinct b.trade_id) as ntrades
        from hh_time as a left join trade_hh(where = (not missing(trade_date))) as b 
        on a.household_number = b.household_number and 
          intnx("month", a.date, 0, "E")= intnx("month", b.trade_date, 0, "E")
        group by a.household_number, a.date 
        order by a.household_number, a.date;
        quit;

      * Rolling 12 month sum, trimleft 11; 
      proc expand data = trade_freq
                  out  = trade_freq;
          by household_number;
          id date;
          convert ntrades= ntrades_12m / method=none transform=(movsum 12 trimleft 11);
          convert ntrades_stock= ntrades_stock_12m / method=none transform=(movsum 12 trimleft 11);
          run; 

      * Add county_fips to household, for possible sample downsizing; 
      proc sql;
        create table trade_freq as select distinct
          a.*, b.county_fips
        from trade_freq as a left join infobase as b
        on a.household_number =b.household_number
        order by a.household_number, a.date;  
        quit; 

      * Export to Stata; 
      PROC EXPORT DATA= WORK.trade_freq
              OUTFILE= "D:\Dropbox\SASLib\Thesis\Stata\trade_freq.dta" 
              DBMS=STATA REPLACE;
        RUN;

      * setting at 10 trades in the last 12 month leaves roughtly 5500 households; 
      * Still going to blow up I am afriad; 

      proc sql;
        create table sample4 as select distinct 
          a.permno, a.rdq1, a.fyearq, a.fqtr, 
          b.household_number, b.ntrades_stock_12m length = 3 
        from permno_rdq1 as a, trade_freq(where = (not missing(ntrades_stock_12m)
           and ntrades_stock_12m>=10 and not missing(county_fips))) as b
        where intnx("month", a.rdq1, -1, "E") = intnx("month", b.date, 0, "E");
        quit; 

      * This ends with half a billion obs and 21GB datasize; 
      * To save time, need to first construct firm and hh controls before joining; 

      data check; 
        set sample4; 
        if not missing(county_fips); 
        run;  
     
  * 5. All households who traded or owned peer stocks;

    * TNIC classifications; 

      * Impport;

        data WORK.TNIC3;
          %let _EFIERR_ = 0; /* set the ERROR detection macro variable */
          infile "&dropbox\SASLib\EdgarLogs\tnic3_data.txt" delimiter='09'x MISSOVER
          DSD lrecl=32767 firstobs=2 ;
          informat year best32. ;
          informat gvkey1 best32. ;
          informat gvkey2 best32. ;
          informat score best32. ;
          format year best12. ;
          format gvkey1 best12. ;
          format gvkey2 best12. ;
          format score best12. ;
          input
             year
             gvkey1
             gvkey2
             score
          ;
          if _ERROR_ then call symputx('_EFIERR_',1);  
          run;

        * Convert gvkeys to z6. character; 

        data TNIC3; 
          set TNIC3; 
          gvkey1_c = put(gvkey1, z6.); 
          gvkey2_c = put(gvkey2, z6.); 
          drop gvkey1 gvkey2; 
          rename gvkey1_c = gvkey1
               gvkey2_c = gvkey2
               ;
          run; 

        * Add permno to gvkey ;

        * First, using crsp links; 

        proc sql;
          create table tnic3 as select distinct a.*, 
            b.lpermno as permno1, b.linkprim, b.LINKDT from 
            tnic3 as a left join elog.ccmxpf_linktable as b 
            on input(a.gvkey1, 32.)  = input(b.gvkey, 32.) and mdy(6, 30,a.year)>=b.LINKDT
                and (mdy(06,31,a.year)<=b.LINKENDDT or missing(LINKENDDT)) ; 
            quit; 
        proc sort data = tnic3; 
          by gvkey1 year gvkey2 descending linkprim descending linkdt;
          run;
        data tnic3; set tnic3; 
          by gvkey1 year gvkey2 descending linkprim descending linkdt;
          if first.gvkey2; 
          drop linkprim linkdt;
          run; 

        proc sql;
          create table tnic3 as select distinct a.*, 
            b.lpermno as permno2, b.linkprim, b.LINKDT from 
            tnic3 as a left join elog.ccmxpf_linktable as b 
            on input(a.gvkey2, 32.)  = input(b.gvkey, 32.) and mdy(6, 30,a.year)>=b.LINKDT
                and (mdy(06,31,a.year)<=b.LINKENDDT or missing(LINKENDDT)) ; 
            quit; 
        proc sort data = tnic3; 
          by gvkey1 year gvkey2 descending linkprim descending linkdt;
          run;
        data tnic3; set tnic3; 
          by gvkey1 year gvkey2 descending linkprim descending linkdt;
          if first.gvkey2; 
          drop linkprim linkdt;
          run;

        * Second, using funda links; 

        libname comp "E:\COMP";

        proc sql;
          create table tnic3 as select distinct a.*, 
            b.lpermno as lpermno1
          from tnic3 as a left join comp.funda as b 
          on a.gvkey1 = b.gvkey and a.year = b.fyear; 
          quit;  
        proc sql;
          create table tnic3 as select distinct a.*, 
            b.lpermno as lpermno2
          from tnic3 as a left join comp.funda as b 
          on a.gvkey2 = b.gvkey and a.year = b.fyear; 
          quit;  

        * Combine crsp and comp link; 

        data tnic3; 
          set tnic3; 
          permno1 = coalesce(permno1, lpermno1); 
          permno2 = coalesce(permno2, lpermno2);
          run; 

        * Rid of dups; 

        proc sort data = tnic3 nodupkey; by year gvkey1 gvkey2; run; 

      * HH holded peers (TNIC include self pairs) in the last 12 month;
      * To start with, choose top 10 nearest peers (11 including the self);

        data tnic3; 
          set tnic3; 
          if gvkey1 = gvkey2 then score = 1; 
          run;   
        proc sort data = tnic3; by year gvkey1 descending score; run; 

        data tnic3_top5; 
          set tnic3; 
          by year gvkey1 descending score; 
          if first.gvkey1 then id = 0; 
            id +1; 
          if id<=6;
          run;

        proc sql;
          create table sample5_1 as select distinct 
            a.permno, a.rdq1, a.fyearq, a.fqtr, 
            b.permno2,
            "1" as holding_tnic3_12
          from permno_rdq1 as a, tnic3_top5 as b
          where a.permno = b.permno1 and year(a.rdq1) = b.year; 
          quit; 

        * This could blow up, so slim down position_hh file to geocoded HH; 

        proc sql;
          create table position_hh_geo as select distinct 
            a.*, 
            b.county_fips
          from position_hh as a, infobase as b 
          where a.household_number = b.household_number;
          quit; 

        proc sql;
          create table sample5_1 as select distinct
            a.permno, a.rdq1, a.fyearq, a.fqtr, 
            b.household_number,
            "1" as holding_tnic3_12
          from sample5_1 as a, position_hh_geo(where = (not missing(county_fips))) as b 
          where a.permno2  = b.permno and intnx("month", a.rdq1, -11, "S") <= intnx
          ("month",b.position_date, 0, "E") <=intnx("month", a.rdq1, 0, "S")
          order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
          quit;

      ** HH who traded peers in the last 12 month; 

        proc sql;
          create table sample5_2 as select distinct 
            a.permno, a.rdq1, a.fyearq, a.fqtr, 
            b.permno2,
            "1" as trading_tnic3_12
          from permno_rdq1 as a, tnic3_top5 as b
          where a.permno = b.permno1 and year(a.rdq1) = b.year; 
          quit; 

        * This could blow up, so slim down position_hh file to geocoded HH; 

        proc sql;
          create table trade_hh_geo as select distinct 
            a.*, 
            b.county_fips
          from trade_hh as a, infobase as b 
          where a.household_number = b.household_number;
          quit; 

        proc sql;
          create table sample5_2 as select distinct
            a.permno, a.rdq1, a.fyearq, a.fqtr, 
            b.household_number,
            "1" as trading_tnic3_12
          from sample5_2 as a, trade_hh_geo(where = (not missing(county_fips))) as b 
          where a.permno2  = b.permno and intnx("month", a.rdq1, -11, "S") <= 
          b.trade_date <=intnx("month", a.rdq1, 0, "S")
          order by a.permno, a.rdq1, a.fyearq, a.fqtr,b.household_number; 
          quit;

      **  Combine holding and trading samples;
        * Some permnos may not be included in tnic classifications, so merge sample3 as well; 

        data sample5; 
          merge sample5_1 sample5_2 sample3;  
          by permno rdq1 fyearq fqtr household_number;
          length holding_tnic3_12 $1; 
          length trading_tnic3_12 $1;  
          run;
        * 22,666,618 obs from holding and 15,009,417 obs from trading, resulting in 
          26,113,791 obs merged;

      ** Try matching trade_id to it;

        proc sql;
          create table sample5 as select distinct 
            a.*, 
            b.*
          from sample5 as a left join permno_rdq1 as b 
          on a.permno = b.permno and a.rdq1 = b.rdq1 
            and a.fyearq = b.fyearq and a.fqtr = b.fqtr; 
          quit;  


        proc sql;
          create table sample5_trade as select distinct 
            a.*, 
            b.trade_date, b.trade_id, b.quantity as trade_quantity, 
            b.principal as trade_amount
          from sample5 as a left join trade_hh(where = (not missing(trade_date))) as b 
          on a.household_number = b.household_number and 
             a.permno = b.permno and 
             a.rdq1 <=b.trade_date <=a.day61; 
          quit;

        * How many trades are included: 311,658 distinct trades; 
        proc sql;
          create table number_trades as select distinct 
            trade_id
          from sample5_trade
          where not missing(trade_date); 
          quit;

        * How many trades are there if we mathced them all to permno-rdq1 pairs;
        * There are 979,741 trades; 
        proc sql;
          create table permno_rdq1_trades as select distinct
            a.permno, a.rdq1, a.fyearq, a.fqtr, 
            b.household_number, 
            b.trade_date, b.trade_id, b.quantity as trade_quantity, 
            b.principal as trade_amount
          from permno_rdq1 as a left join trade_hh_geo(where = (not missing(trade_date))) as b 
          on a.permno = b.permno and 
             a.rdq1 <=b.trade_date <=a.day61; 
          quit;
        proc sql;
          create table number_trades as select distinct 
            trade_id
          from permno_rdq1_trades
          where not missing(trade_date); 
          quit;

  * 6. HH who actually traded during [0, 61];  

    proc sql;
      create table sample6 as select distinct 
        a.permno, a.rdq1, a.fyearq, a.fqtr, 
        b.household_number
      from permno_rdq1 as a, trade_hh as b 
      where a.permno = b.permno and  
         a.rdq1 <=b.trade_date <=a.day61; 
      quit;

* For each household level sample, check how many trades can linked in [0, 61]; 

  * Sample1: HH held/traded shares in the preceding month of rdq1; 
  * Sample2: HH held/traded shares in the pre-announcement window [-41, -11]; 
  * Sample3: HH held/traded shares in the month -11 to month -1; 
  * Sample4: HH with more than 10 stock trades in the last 12 month; 
  * Sample5: HH held/traded shares of its own stock and its nearest 5 peers in past 12 month;

  %macro number_trades(input = );


    * Only hh with county_fips; 

    proc sql;
      create table tmp as select distinct 
        a.*, b.county_fips
      from &input as a, infobase as b 
      where a.household_number = b.household_number
        and not missing(b.county_fips);

      create table tmp as select distinct 
        a.*, b.day61
      from tmp as a left join permno_rdq1 as b 
      on a.permno = b.permno and a.rdq1 = b.rdq1 
        and a.fyearq = b.fyearq and a.fqtr = b.fqtr; 

      create table &input._trade as select distinct 
        a.*, 
        b.trade_date, b.trade_id, b.quantity as trade_quantity, 
        b.principal as trade_amount
      from tmp as a left join trade_hh(where = (not missing(trade_date))) as b 
      on a.household_number = b.household_number and 
         a.permno = b.permno and 
         a.rdq1 <=b.trade_date <=a.day61; 
      quit;

    * How many trades are included; 
    proc sql;
      create table &input._uniqueTrades as select distinct 
        trade_id
      from &input._trade
      where not missing(trade_date); 
      quit;

    * How many permno-rdq1-tradingHH paris; 
    proc sql;
      create table &input._uniqueHH as select distinct 
        permno, rdq1, fyearq, fqtr, household_number
      from &input._trade
      where not missing(trade_date); 
      quit;

    %mend;   

  %number_trades(input = sample1); * 378,683 trades; 
  %number_trades(input = sample2); * 442,090 trades; 
  %number_trades(input = sample3); * 461,219trades; 
  %number_trades(input = sample31); * 480,652 trades, using 36 month interval; 
  %number_trades(input = sample6); * 830ktrades; 


  * Conclusion: Out of the total 640k trades possible, 12 month held/traded sample3
    seems to be the sweet spot;

* Construct the final HH sample; 
  
  * Combine sample1, sample2, sample3, sample31, and sample6; 

    data sample_base; 
      merge sample1(in = in1) sample2(in=in2) sample3(in=in3) 
            sample31(in=in4); 
      by permno rdq1 fyearq fqtr household_number; 
      run;

    data sample_base; 
      merge sample_base (in = in1) sample6 (in = in2); 
      by permno rdq1 fyearq fqtr household_number; 
      new_buyer = 0; 
      if not in1 and in2 then new_buyer = 1;  
      run; 


    proc sql;
      create table sample_base as select distinct
        a.*, b.county_fips
      from sample_base as a, infobase as b 
      where a.household_number = b.household_number
        and not missing(b.county_fips)
      order by a.permno, a.rdq1, a.fyearq, a.fqtr; 
      quit;

    proc sort data = sample_base nodupkey;
      by permno rdq1 fyearq fqtr household_number;
      run;
    data check; set sample_base; if new_buyer = 0; run; 
    *%number_trades(input = check); * 508,853 unique trades; 
    *%number_trades(input = sample_base); * 831,100 unique trades; 
    * Roughly 60% trades can be included based on past holding/trading history; 

  * Firm controls;

    proc sql;
      create table sample_base as select distinct 
        a.*, b.*
      from sample_base as a, permno_rdq1 as b 
      where a.permno = b.permno and a.rdq1 = b.rdq1 
        and a.fyearq = b.fyearq and a.fqtr = b.fqtr; 
      quit; 

  * Firm-HH SCI; 

    proc sql; 
      create table sample_base as select distinct 
        a.*, b.sci, b.own_county_pop as firm_pop, 
        b.friend_county_pop as hh_pop, 
        (b.sci/(b.own_county_pop*friend_county_pop))*(10**12)as relative_fp, 
        (b.sci/friend_county_pop)*(10**6) as idegree
      from sample_base as a, fb2 as b 
      where a.county = b.own_county and a.county_fips=b.friend_county; 
      quit;

  * HH controls; 

    * Number of trades in [0, 1] and [2, 61];

      data sample_base; set sample_base; drop ntrades_01 ntrades_261 ntrades_41_11; run;  

      proc sql;

        * [0, 1]; 
        create table sample_base as select distinct 
          a.*, 
          count(distinct trade_id) as ntrades_01
        from sample_base as a left join trade_hh as b 
        on a.permno = b.permno and a.household_number = b.household_number
          and a.rdq1<=b.trade_date<=a.day1 and not missing(b.trade_date)
        group by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number
        order by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number; 

        * [2, 61];
        create table sample_base as select distinct 
          a.*, 
          count(distinct trade_id) as ntrades_261
        from sample_base as a left join trade_hh as b 
        on a.permno = b.permno and a.household_number = b.household_number
          and a.day1<b.trade_date<=a.day61 and not missing(b.trade_date) 
        group by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number
        order by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number; 

        * [-41, -11]; 
        create table sample_base as select distinct 
          a.*, 
          count(distinct trade_id) as ntrades_41_11
        from sample_base as a left join trade_hh as b 
        on a.permno = b.permno and a.household_number = b.household_number
          and a.day_41<=b.trade_date<=a.day_11 and not missing(b.trade_date) 
        group by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number
        order by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number; 

        quit;

    * Total shareturnover in [0, 1] and [2, 61]

      * Scale trade amount by begining-of-month ptf value;
      * Trade can be recorded further than position data, so match the nearest
        pft values;


      proc sql;
        create table relative_trade as select distinct 
          a.*, 
          a.principal/b.ptf as td_ptf, 
          a.principal/b.stock_ptf as td_stock_ptf,
          a.trade_date-intnx("month", b.date, 0, "E") as time_diff
        from trade_hh as a left join ptf_hh_final as b 
        on a.household_number = b.household_number and 
          a.trade_date > intnx("month", b.date, 0, "E") 
        group by a.trade_id
        having abs(time_diff) = min(abs(trade_date-intnx("month", b.date, 0, "E")))
        order by a.trade_id; 
        quit; 

      * With properly infered ptf_hh_final dataset, the matching of trade to 
        ptf balance is much more improved!; 
      * proc means data = relative_trade; 
      *   var time_diff; 
      *   run ;


      * Sum of trades and relative trades;

      data sample_base; set sample_base; 
        drop td_ptf: td_stock_ptf: quantity: principal: ; 
        run;  

      proc sql;
        create table sample_base as select distinct 
          a.*, 
          sum(abs(td_ptf)*(a.rdq1<=b.trade_date<=a.day1)) as td_ptf01, 
          sum(abs(td_ptf)*(a.day1<b.trade_date<=a.day61)) as td_ptf261, 
          sum(abs(td_ptf)*(a.day_41<=b.trade_date<=a.day_11)) as td_ptf_41_11,
          sum(abs(td_stock_ptf)*(a.rdq1<=b.trade_date<=a.day1)) as td_stock_ptf01, 
          sum(abs(td_stock_ptf)*(a.day1<b.trade_date<=a.day61)) as td_stock_ptf261, 
          sum(abs(td_stock_ptf)*(a.day_41<=b.trade_date<=a.day_11)) as td_stock_ptf_41_11,
          sum(abs(quantity)*(a.rdq1<=b.trade_date<=a.day1)) as quantity01, 
          sum(abs(quantity)*(a.day1<b.trade_date<=a.day61)) as quantity261, 
          sum(abs(quantity)*(a.day_41<=b.trade_date<=a.day_11)) as quantity_41_11,
          sum(abs(principal)*(a.rdq1<=b.trade_date<=a.day1)) as principal01, 
          sum(abs(principal)*(a.day1<b.trade_date<=a.day61)) as principal261, 
          sum(abs(principal)*(a.day_41<=b.trade_date<=a.day_11)) as principal_41_11
        from sample_base as a left join relative_trade as b 
        on a.permno = b.permno and a.household_number = b.household_number
          and a.day_41<=b.trade_date<=a.day61 and not missing(b.trade_date)
        group by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number
        order by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number; 
        quit;

    * Demogrpahics; 

      proc sql;
        create table sample_base as select distinct 
          a.*, 
          input(b.income_code, 8.) as hh_income,
          b.martial_status as hh_married, 
          b.HH_gender
        from sample_base as a left join infobase as b 
        on a.household_number = b.household_number; 
        quit; 

    * Past-trading activity; 

      * Past 3, 6, 12-month number of trades; 

        proc printto log=junk;run;

        proc expand data = trade_freq
                    out  = past_tradees;
            by household_number;
            id date;
            convert ntrades= ntrades_12m / method=none transform=(movsum 12);
            convert ntrades_stock= ntrades_stock_12m / method=none transform=(movsum 12);
            convert ntrades= ntrades_3m / method=none transform=(movsum 3);
            convert ntrades_stock= ntrades_stock_3m / method=none transform=(movsum 3);
            convert ntrades= ntrades_6m / method=none transform=(movsum 6);
            convert ntrades_stock= ntrades_stock_6m / method=none transform=(movsum 6);
            run; 

        proc printto; run;

        proc sql;
          create table sample_base as select distinct
            a.*, 
            b.ntrades_12m, b.ntrades_6m, b.ntrades_3m, 
            b.ntrades_stock_12m, b.ntrades_stock_6m, b.ntrades_stock_3m,
            12*(year(b.date)-year(b.household_open_date))+
              (month(b.date)-month(b.household_open_date))+1 as month_since_open
          from sample_base as a left join past_tradees as b 
          on a.household_number = b.household_number and 
             intnx("month", a.rdq1, -1, "E") = intnx("month", b.date, 0, "E"); 
          quit; 

      * Past 3, 6, 12-month turnover, together with ptf balance;  

        * Following Barber and Odean (2000), calcualte turnover as the buy from t
          and sales from t+1 dived by month-end portfolio balance of t; 
        * Buy and sale values are market values at the end of month t; 
        * Since not all trades can be matched to position data, use actual price 
          as backup; 
        proc sql;
          create table buysell as select distinct a.*,
            coalesce(a.principal, b.position_equity/b.position_quantity*a.quantity)
            as dvom_t, b.date as month_t
          from trade_hh as a left join position_hh_final as b 
          on a.household_number = b.household_number and 
            (
              (intnx("month",a.trade_date, 0, "E") = intnx("month",
               b.date,0, "E") and a.buy_sell = "B") 
              or 
              (intnx("month",a.trade_date, -1, "E") = intnx("month",
                b.date,0,"E") and a.buy_sell="S")
            )
             and 
            substr(strip(a.cusip), 1, 8)  = substr(strip(b.cusip),1,8); 
          quit; * Note, not all trades are matched to position data;  

        proc sort data = buysell nodupkey; by trade_id;run;

        proc sql;
          create table turnover_hh_final as select distinct a.household_number, a.date,
            a.ptf, a.stock_ptf,
            sum(abs(b.dvom_t))/a.ptf as turn, 
            sum(abs(b.dvom_t))/a.stock_ptf as turns,
            sum(abs(b.dvom_t*(buy_sell="B")))/a.ptf as pturn, 
            sum(abs(b.dvom_t*(buy_sell="B")))/a.stock_ptf as pturns,
            sum(abs(b.dvom_t*(buy_sell="S")))/a.ptf as sturn, 
            sum(abs(b.dvom_t*(buy_sell="S")))/a.stock_ptf as sturns
          from ptf_hh_final as a left join buysell as b 
          on a.household_number = b.household_number and a.date = b.month_t
          group by a.household_number, a.date; 
          quit; 

        * Fill in missing turnover as zeros (i.e., there are no trades matched); 
        proc sort data = turnover_hh_final nodupkey; by household_number date; run;   
        data turnover_hh_final;
          set turnover_hh_final; 
          /* Ptf turnover */
          if missing(turn) then turn = 0; 
          if missing(pturn) then pturn = 0; 
          if missing(sturn) then sturn = 0; 

          /* Stock Ptf turnover */
          if not missing(stock_ptf) then do; 
            if missing(turns) then turns = 0; 
            if missing(pturns) then pturns = 0; 
            if missing(sturns) then sturns = 0; 
          end; 
          run;

        proc printto log=junk;run;
        proc expand data = turnover_hh_final
                    out  = past_turnover;
            by household_number;
            id date;

            convert turn = turn_3m / method=none transform=(movave 3);
            convert turn = turn_6m / method=none transform=(movave 6);
            convert turn = turn_12m / method=none transform=(movave 12);
            convert turns = turns_3m / method=none transform=(movave 3);
            convert turns = turns_6m / method=none transform=(movave 6);
            convert turns = turns_12m / method=none transform=(movave 12);

            run; 

        proc printto;run;

        proc sql;
          create table sample_base as select distinct
            a.*, 
            b.turn_12m, b.turn_6m, b.turn_3m, 
            b.turns_12m, b.turns_6m, b.turns_3m, 
            b.ptf as rdq1_ptf, b.stock_ptf as rdq1_stock_ptf
          from sample_base as a left join past_turnover as b 
          on a.household_number = b.household_number and 
             intnx("month", a.rdq1, -1, "E") = intnx("month", b.date, 0, "E"); 
          quit; 

      * Numerb of stocks in the portfolio; 

        proc sql;
          create table ptf_comp as select distinct 
            household_number, date, count(distinct permno) as ptf_nstocks,
            count(distinct cusip) as ptf_nassets
          from position_hh_final 
          group by household_number, date; 
          quit; 

        proc sql; 
          create table sample_base as select distinct
            a.*, b.ptf_nstocks, b.ptf_nassets
          from sample_base as a left join ptf_comp as b 
          on a.household_number = b.household_number and 
             intnx("month", a.rdq1, -1, "E") = intnx("month", b.date, 0, "E");
          quit; 

  * Check for valid regression obs;

    * Conclusion: Sind_p_ctex has a lot of missing values;  

    %let vars = permno, rdq1, fyearq, fqtr, household_number,
                logme, logbm, idvol, ior, urban, sp500, retail, ep2_winsor, 
                evol2_winsor, nrdq_rk2, rptlag, log_xad, pop_density;* sind_p_ctex; 
    %let yvar =  
                 ntrades_01, ntrades_261;
    %let svar = relative_fp,idegree; 

    data check; 
      set sample_base; 
      if not missing(sci) and missing(sind_p_ctex); 
      run;

    data sample_base; 
      set sample_base; 
      sind_p_ctex = coalesce(sind_p_ctex, 0); 
      run; 

  * Add spatial network measures; 


    PROC IMPORT OUT= WORK.county_loc
                DATAFILE= "D:\Dropbox\SASLib\Thesis\County_locations.csv" 
                DBMS=CSV REPLACE;
         GETNAMES=Yes;
         DATAROW=2; 
    RUN;

    * Match lat and lon to home county and calcualte pairwise distances; 
    proc sql;
      create table fb2
      as select a.*, b.INTPTLAT, b.INTPTLONG
      from fb2 as a, county_loc as b 
      where a.own_county = geoid; 
      quit; 

    proc sql;
      create table fb2
      as select a.own_county, a.friend_county, a.sci, a.from, a.to, 
        a.own_county_pop, a.friend_county_pop, b.ALAND,
        1/geodist(a.INTPTLAT, a.INTPTLONG, b.INTPTLAT, b.INTPTLONG) as dist_inv,
        0.533/(1+0.032*geodist(a.INTPTLAT, a.INTPTLONG, b.INTPTLAT, b.INTPTLONG))**2.788
        as friend_prop
      from fb2 as a, county_loc as b
      where a.friend_county=b.geoid; 
      quit;

      data fb2; 
        set fb2; 
        sci_gravity = own_county_pop*friend_county_pop*distsqrd_inv; 
        * sci_geo1 is missing for i=j; 
        * Calcualte average distiance in a uniform square, which is equal to 0.52140;
        * Scale that by \sqrt(ALAND);
        if from = to then sci_gravity = own_county_pop*friend_county_pop/((0.5214*sqrt(aland))**2);
        run ; 

    * Multiply distance based friendship prop by population ;
    * sci_geo1 is based on inverse of distance; 
    * sci_geo2 is based on Butts(2002)'s calibration;
    data fb2; 
      set fb2; 
      sci_geo1 = own_county_pop*friend_county_pop*dist_inv; 
      sci_geo2 = own_county_pop*friend_county_pop*friend_prop;
      sci_gravity = own_county_pop*friend_county_pop*(dist_inv**2); 
      * sci_geo1 is missing for i=j; 
      * Calcualte average distiance in a uniform square, which is equal to 0.52140;
      * Scale that by \sqrt(ALAND);
      if from = to then sci_geo1 = own_county_pop*friend_county_pop/(0.5214*sqrt(aland));
      if from = to then sci_gravity = own_county_pop*friend_county_pop/((0.5214*sqrt(aland))**2);
      run ;

    * merge back to the main dataset; 
    proc sql; 
      create table sample_base as select distinct 
        a.*, b.sci_geo2, b.sci_geo1, b.sci_gravity
      from sample_base as a, fb2 as b 
      where a.county = b.own_county and a.county_fips=b.friend_county; 
      quit;

/*******************************************************************************/
/****************** Are trades more harmful to high RSCI households? ***********/
/*******************************************************************************/

* This regression follows Echo Chamber in which they tests sentiment declaration on 
  StockTwits and subsequent returns; 

* Prepare for stock return data and dgtw-adjusted returns; 

  libname crsp "E:\CRSP";

  data crsp_d;
    set crsp.crsp_d;
    if shrcd in (10,11);
    /*  if exchcd in (1 2 3);*/
    P = abs(prc)/cfacpr;
    
    *adjust NASDAQ volume according to Gao and Ritter (2010);
    vol_adj = vol;
    if exchcd=3 and date < mdy(02,01, 2001) then
      vol_adj = vol/2;
    if exchcd=3 and mdy(02,1, 2001)<= date <= mdy(12,31, 2001) then
      vol_adj = vol/1.8;
    if exchcd=3 and year(date) in (2002 2003) then
      vol_adj = vol/1.6;
    if exchcd=3 and year(date) >=2004 then
      vol_adj = vol/1.0;
    VOM = vol_adj*cfacshr;
    * in the version before 10/20/2018, the above one-line code was 
      DVOM = log(1+VOM*P) and it was wrong;
    DVOM = vol_adj*abs(prc);
    TSO = shrout*cfacshr*1000;

    if TSO <= 0 then TSO = .;
    if VOM < 0 then VOM = .;
    if DVOM < 0 then DVOM = .;
    
    label P = "Price at Period End, Adjusted";
    label TSO = "Total Shares Outstanding, Adjusted";
    label VOM = "Trading Volume Daily (logged), Adjusted";
    label DVOM = "Trading Volume Daily in Dollars, (logged) Adjusted";
    format ret percentn8.4 P DVOM dollar12.3 TSO VOM comma12. ;

    keep permno date P vom dvom tso TSO cfacpr cfacshr ret vwretd; 

    run;

  proc sql;
    create table crsp_d
    as select distinct a.*, b.date as formdate "Formation Date", b.dgtw_port
    from crsp_d as a left join rep.dgtw_assignment_wrds as b
    on a.permno=b.permno and b.date < a.date<=intnx('month', b.date,12,'e');


    create table crsp_d (index=(perm_dat=(permno date)))
    as select a.*,b.DGTW_VWRET format percentn8.4 "DGTW Benchmark Return",
      (a.ret-b.DGTW_VWRET) as DGTW_XRET "DGTW Excess Return" format percentn8.4
    from crsp_d as a left join rep.dgtw_vwret as b
    on a.dgtw_port=b.dgtw_port and a.date=b.date;
  quit;

* Add sci and adjust shares and trade prices for splits; 

  proc sql; 
    create table trade_hh_geo1 as select distinct 
      a.*, b.county
    from trade_hh_geo as a left join permno_rdq1 as b 
    on a.permno = b.permno and b.rdq1<=a.trade_date<b.next_rdq1;

    create table trade_hh_geo2 as select distinct 
      a.*, b.sci, b.own_county_pop as firm_pop, 
      b.friend_county_pop as hh_pop, 
      (b.sci/(b.own_county_pop*friend_county_pop))*(10**12)as relative_fp, 
      (b.sci/friend_county_pop)*(10**6) as idegree,
      dist_inv, sci_geo1, sci_geo2, sci_gravity 
    from trade_hh_geo1 as a,  fb2 as b 
    where a.county = b.own_county and a.county_fips=b.friend_county; 

    create table trade_hh_geo2 as select distinct 
      a.*, a.quantity*cfacshr as quantity_adj, 
      a.price/cfacpr as price_adj
    from trade_hh_geo2 as a left join crsp_d as b
    on a.permno = b.permno and a.trade_date = b.date; 
    quit; 

* For each trade, calcualte future returns; 
* 1-, 2-, 3-, 6-, and 12-month; 
* Until next rdq1 returns; 

  * Add most recent quarterly contorls; 

    proc sql;
      create table trade_return as select distinct 
        a.household_number, a.permno, a.trade_date, 
        a.quantity_adj, a.quantity, 
        a.price_adj, a.price, 
        a.principal, a.trade_commission, 
        a.trade_id, a.county_fips, a.county, 
        a.sci, a.firm_pop, a.hh_pop, a.relative_fp, 
        a.idegree, a.dist_inv, a.sci_geo1, a.sci_geo2, 
        a.sci_gravity, a.buy_sell,
        b.*
      from trade_hh_geo2 as a, permno_rdq1 as b
      where a.permno = b.permno and not missing(a.trade_date)
        and b.rdq1<=a.trade_date<b.next_rdq1; 
      quit; 


  * Calculate returns;

    proc sql;
      create table trade_return as select distinct  
        a.*,
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<=a.day61))) -1 as cret_day61, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<a.next_rdq1))) -1 as cret_nextrdq1, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("day", a.trade_date, 7)))) -1 as cret_5d, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("day", a.trade_date, 14)))) -1 as cret_10d, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("day", a.trade_date, 21)))) -1 as cret_15d, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("month", a.trade_date, 1, "S")))) -1 as cret_1m, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("month", a.trade_date, 2, "S")))) -1 as cret_2m, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("month", a.trade_date, 3, "S")))) -1 as cret_3m, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("month", a.trade_date, 6, "S")))) -1 as cret_6m, 
        exp(sum(log(1+b.ret)*(a.trade_date<b.date<intnx("month", a.trade_date, 12, "S")))) -1 as cret_12m, 

        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<=a.day61))) -1 as cvwret_day61, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<a.next_rdq1))) -1 as cvwret_nextrdq1, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("day", a.trade_date, 7)))) -1 as cvwret_5d, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("day", a.trade_date, 14)))) -1 as cvwret_10d, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("day", a.trade_date, 21)))) -1 as cvwret_15d, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("month", a.trade_date, 1, "S")))) -1 as cvwret_1m, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("month", a.trade_date, 2, "S")))) -1 as cvwret_2m, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("month", a.trade_date, 3, "S")))) -1 as cvwret_3m, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("month", a.trade_date, 6, "S")))) -1 as cvwret_6m, 
        exp(sum(log(1+b.dgtw_vwret)*(a.trade_date<b.date<intnx("month", a.trade_date, 12, "S")))) -1 as cvwret_12m
      from trade_return as a left join crsp_d as b 
      on a.permno = b.permno and a.trade_date<=b.date<=intnx("month", a.trade_date, 12, "S")
      group by a.trade_id; 
      quit;

    data trade_return_backup; 
      set trade_return; 
      run;

  data trade_return; 
    set trade_return; 
    array x (*) cret_:; 
    array y (*) cvwret_:; 
    array z (*) cxret_day61  cxret_nextrdq1 cxret_5d cxret_10d 
      cxret_15d cxret_1m cxret_2mn cxret_3m cxret_6m cxret_12m; 
    do i = 1 to dim(x); 
      z(i) = x(i)-y(i); 
    end; 
    drop i; 
    run; 
    


* Adding HH and trade controls controls; 

  * Demogrphics;

    proc sql;
      create table trade_return1 as select distinct 
        a.*, 
        input(b.income_code, 8.) as hh_income,
        b.martial_status as hh_married, 
        b.HH_gender
      from trade_return as a left join infobase as b 
      on a.household_number = b.household_number; 
      quit; 

  * Past number of trades, turnover, ptf compositoin; 

    proc sql;
      create table trade_return1 as select distinct
        a.*, 
        b.ntrades_12m, b.ntrades_6m, b.ntrades_3m, 
        b.ntrades_stock_12m, b.ntrades_stock_6m, b.ntrades_stock_3m,
        12*(year(b.date)-year(b.household_open_date))+
          (month(b.date)-month(b.household_open_date))+1 as month_since_open
      from trade_return1 as a left join past_tradees as b 
      on a.household_number = b.household_number and 
         intnx("month", a.trade_date, -1, "E") = intnx("month", b.date, 0, "E"); 
      quit;

    proc sql;
      create table trade_return1 as select distinct
        a.*, 
        b.turn_12m, b.turn_6m, b.turn_3m, 
        b.turns_12m, b.turns_6m, b.turns_3m, 
        b.ptf as ptf, b.stock_ptf as stock_ptf
      from trade_return1 as a left join past_turnover as b 
      on a.household_number = b.household_number and 
         intnx("month", a.trade_date, -1, "E") = intnx("month", b.date, 0, "E"); 
      quit; 


    proc sql; 
      create table trade_return1 as select distinct
        a.*, b.ptf_nstocks, b.ptf_nassets
      from trade_return1 as a left join ptf_comp as b 
      on a.household_number = b.household_number and 
         intnx("month", a.trade_date, -1, "E") = intnx("month", b.date, 0, "E");
      quit;


  * Past trades of the same stock; 

    %macro past_strades(month= ); 

      proc sql;
        create table trade_return1 as select distinct 
          a.*, 
          count(distinct b.trade_id) as nstrades_&month.m, 
          coalesce(sum(abs(b.td_ptf)), 0) as td_ptf_&month.m, 
          coalesce(sum(abs(b.td_stock_ptf)), 0) as td_stock_ptf_&month.m,
          coalesce(sum(abs(b.principal)), 0) as principal_&month.m
        from trade_return1 as a left join relative_trade as b
        on a.household_number = b.household_number and
          a.permno = b.permno and  
          intnx("month", a.trade_date, -&month, "S")<= b.trade_date
          <a.trade_date
        group by a.trade_id;
        quit; 

      %mend; 

    %past_strades(month=1); 
    %past_strades(month=3); 
    %past_strades(month=6); 
    %past_strades(month=12);

    %let month = rdq1;
    proc sql;
      create table trade_return1 as select distinct 
        a.*, 
        count(distinct b.trade_id) as nstrades_&month.m, 
        coalesce(sum(abs(b.td_ptf)), 0) as td_ptf_&month.m, 
        coalesce(sum(abs(b.td_stock_ptf)), 0) as td_stock_ptf_&month.m,
        coalesce(sum(abs(b.principal)), 0) as principal_&month.m
      from trade_return1 as a left join relative_trade as b
      on a.household_number = b.household_number and
        a.permno = b.permno and  
        rdq1<= b.trade_date
        <a.trade_date
      group by a.trade_id;
      quit; 
    %let month = day1;
    proc sql;
      create table trade_return1 as select distinct 
        a.*, 
        count(distinct b.trade_id) as nstrades_&month.m, 
        coalesce(sum(abs(b.td_ptf)), 0) as td_ptf_&month.m, 
        coalesce(sum(abs(b.td_stock_ptf)), 0) as td_stock_ptf_&month.m,
        coalesce(sum(abs(b.principal)), 0) as principal_&month.m
      from trade_return1 as a left join relative_trade as b
      on a.household_number = b.household_number and
        a.permno = b.permno and  
        day1< b.trade_date
        <a.trade_date
      group by a.trade_id;
      quit; 

* Add sample selection dummy from odean_hhnew; 

  proc sql;
    create table trade_return1 as select distinct 
      a.*,
      b.*
    from trade_return1 as a left join sample_base(keep = permno rdq1
      fyearq fqtr holding_: trading_: household_number) as b 
    on a.permno = b.permno and a.rdq1 = b.rdq1 and a.fyearq=b.fyearq and 
      a.fqtr = b.fqtr and a.household_number = b.household_number; 
    quit; 

  data trade_return1; 
    set trade_return1; 
    sind_p_ctex = coalesce(sind_p_ctex, 0); 
    run;

* Add Bid-ask spread component of buy and sells following Barber and Odean (2000) eq(1);
  
  * First, replicate Barber/Odean's statistics; 

    data check; set trade; if not missing(permno); run ;

    proc sql;
      create table trade_hh_spread as select 
        a.*, 
        case 
          when a.buy_sell = "B" then -(abs(b.prc)/a.price-1)
          when a.buy_sell = "S" then (abs(b.prc)/a.price -1)
        end as spread
      from trade_hh as a left join crsp.crsp_d as b 
      on a.permno = b.permno and a.trade_date = b.date and b.prc>0;
      quit;

    PROC EXPORT DATA= WORK.trade_hh_spread
                OUTFILE= "D:\Dropbox\SASLib\Thesis\Stata\odean_spread.dta" 
                DBMS=STATA REPLACE;
          RUN;




  proc sql;
    create table trade_return2 as select 
      a.*, 
      case 
        when a.buy_sell = "B" then -(abs(b.prc)/a.price-1)
        when a.buy_sell = "S" then (abs(b.prc)/a.price -1)
      end as spread, 
      sum(calculated spread, trade_commission/abs(principal)) 
      as trade_cost 
    from trade_return1 as a left join crsp.crsp_d as b 
    on a.permno = b.permno and a.trade_date = b.date and b.prc>0;
    quit;  



/********************************************************************************
                           Add back the traidng costs
********************************************************************************/

  * Calculate net return of each trade, following Barber/Odean 2000; 
  * dnret: dollar profit and losses based on raw return net of tradidng cost;  
  * dxnret: dollar profit and losses based on abnormal dgtw return net of trading cost;
  * Price should be Closing Price - Purchase Price, or (1+Ret_crsp)*(1-spread) for buy 
    and (1+Ret_crsp)/(1+spread) for sale; 
  * Decompose the dollar profit and losses into three parts: 
    1. CRSP cumulative returns estimated form closing to closing prices
    2. Spread cost calculated from traded price to closing price
    3. Trade commission; 

    proc sql;
      create table trade_return3 as select distinct a.*,
        b.prc as trade_date_clprc
      from trade_return2 as a left join crsp.crsp_d as b 
      on a.permno = b.permno and a.trade_date = b.date; 
      quit; 

    data trade_return3; 
      set trade_return3; 
      * dollar P&L; 
      if buy_sell = "B" then do; 
        pl = principal*((1-spread)*(1+cret_day61)-1) - trade_commission;  
        pl_spread = principal*((1-spread) -1); 
        pl_ret = principal*(1-spread)*cret_day61; 
        pl_commission = - trade_commission; 
        pldgtw = principal*((1-spread)*(1+cxret_day61)-1) - trade_commission;  
        pldgtw_spread = principal*((1-spread) -1); 
        pldgtw_ret = principal*(1-spread)*cxret_day61; 
        pldgtw_commission = - trade_commission; 

      end; 
      if buy_sell = "S" then do; 
        pl = principal*((1+spread)*(1+cret_day61)-1) - trade_commission;  
        pl_spread = principal*((1+spread) -1); 
        pl_ret = principal*(1+spread)*cret_day61; 
        pl_commission = -trade_commission; 
        pldgtw = principal*((1+spread)*(1+cxret_day61)-1) - trade_commission;  
        pldgtw_spread = principal*((1+spread) -1); 
        pldgtw_ret = principal*(1+spread)*cxret_day61; 
        pldgtw_commission = -trade_commission; 
      end; 

      pl_spread_check = principal*(abs(trade_date_clprc)/price-1); 

      array x(*) pl pl_spread pl_ret pl_commission 
                pldgtw pldgtw_spread pldgtw_ret pldgtw_commission  ;
      array y(*) rpl rpl_spread rpl_ret rpl_commission 
                rpldgtw rpldgtw_spread rpldgtw_ret rpldgtw_commission  ;
      do i = 1 to dim(x); 
        y(i) = x(i)/stock_ptf; 
      end; 
      drop i; 
      run; 


  * Calcualte and aggregate dollar P&L for just day61 window; 

    data sample_base; 
      set sample_base; 
      drop rpl_nextrdq1: rpl_5d: rpl_10d: rpl_15d: rpl_1m: rpl_2m: 
        rpl_3m: rpl_6m: rpl_12m: ;  
      drop rpldgtw_nextrdq1: rpldgtw_5d: rpldgtw_10d: rpldgtw_15d: 
          rpldgtw_1m: rpldgtw_2m: rpldgtw_3m: rpldgtw_6m: rpldgtw_12m: ;  
      drop pl_nextrdq1: pl_5d: pl_10d: pl_15d: pl_1m: pl_2m: 
        pl_3m: pl_6m: pl_12m: ;  
      drop pldgtw_nextrdq1: pldgtw_5d: pldgtw_10d: pldgtw_15d: 
          pldgtw_1m: pldgtw_2m: pldgtw_3m: pldgtw_6m: pldgtw_12m: ;  
      run; 

    proc sql;
      create table sample_base as select distinct a.*, 
        sum(b.pl*(a.rdq1<=b.trade_date<=a.day1)) as pl_01, 
        sum(b.pl*(a.day1<b.trade_date<=a.day61)) as pl_261, 
        sum(b.pl*(a.day_41<=b.trade_date<=a.day_11)) as pl_41_11,
        sum(b.pl_spread*(a.rdq1<=b.trade_date<=a.day1)) as pl_spread_01, 
        sum(b.pl_spread*(a.day1<b.trade_date<=a.day61)) as pl_spread_261, 
        sum(b.pl_spread*(a.day_41<=b.trade_date<=a.day_11)) as pl_spread_41_11,
        sum(b.pl_ret*(a.rdq1<=b.trade_date<=a.day1)) as pl_ret_01, 
        sum(b.pl_ret*(a.day1<b.trade_date<=a.day61)) as pl_ret_261, 
        sum(b.pl_ret*(a.day_41<=b.trade_date<=a.day_11)) as pl_ret_41_11,
        sum(b.pl_commission*(a.rdq1<=b.trade_date<=a.day1)) as pl_commission_01, 
        sum(b.pl_commission*(a.day1<b.trade_date<=a.day61)) as pl_commission_261, 
        sum(b.pl_commission*(a.day_41<=b.trade_date<=a.day_11)) as pl_commission_41_11,

        sum(b.pldgtw*(a.rdq1<=b.trade_date<=a.day1)) as pldgtw_01, 
        sum(b.pldgtw*(a.day1<b.trade_date<=a.day61)) as pldgtw_261, 
        sum(b.pldgtw*(a.day_41<=b.trade_date<=a.day_11)) as pldgtw_41_11,
        sum(b.pldgtw_spread*(a.rdq1<=b.trade_date<=a.day1)) as pldgtw_spread_01, 
        sum(b.pldgtw_spread*(a.day1<b.trade_date<=a.day61)) as pldgtw_spread_261, 
        sum(b.pldgtw_spread*(a.day_41<=b.trade_date<=a.day_11)) as pldgtw_spread_41_11,
        sum(b.pldgtw_ret*(a.rdq1<=b.trade_date<=a.day1)) as pldgtw_ret_01, 
        sum(b.pldgtw_ret*(a.day1<b.trade_date<=a.day61)) as pldgtw_ret_261, 
        sum(b.pldgtw_ret*(a.day_41<=b.trade_date<=a.day_11)) as pldgtw_ret_41_11,
        sum(b.pldgtw_commission*(a.rdq1<=b.trade_date<=a.day1)) as pldgtw_commission_01, 
        sum(b.pldgtw_commission*(a.day1<b.trade_date<=a.day61)) as pldgtw_commission_261, 
        sum(b.pldgtw_commission*(a.day_41<=b.trade_date<=a.day_11)) as pldgtw_commission_41_11, 

        sum(b.rpl*(a.rdq1<=b.trade_date<=a.day1)) as rpl_01, 
        sum(b.rpl*(a.day1<b.trade_date<=a.day61)) as rpl_261, 
        sum(b.rpl*(a.day_41<=b.trade_date<=a.day_11)) as rpl_41_11,
        sum(b.rpl_spread*(a.rdq1<=b.trade_date<=a.day1)) as rpl_spread_01, 
        sum(b.rpl_spread*(a.day1<b.trade_date<=a.day61)) as rpl_spread_261, 
        sum(b.rpl_spread*(a.day_41<=b.trade_date<=a.day_11)) as rpl_spread_41_11,
        sum(b.rpl_ret*(a.rdq1<=b.trade_date<=a.day1)) as rpl_ret_01, 
        sum(b.rpl_ret*(a.day1<b.trade_date<=a.day61)) as rpl_ret_261, 
        sum(b.rpl_ret*(a.day_41<=b.trade_date<=a.day_11)) as rpl_ret_41_11,
        sum(b.rpl_commission*(a.rdq1<=b.trade_date<=a.day1)) as rpl_commission_01, 
        sum(b.rpl_commission*(a.day1<b.trade_date<=a.day61)) as rpl_commission_261, 
        sum(b.rpl_commission*(a.day_41<=b.trade_date<=a.day_11)) as rpl_commission_41_11,
        
        sum(b.rpldgtw*(a.rdq1<=b.trade_date<=a.day1)) as rpldgtw_01, 
        sum(b.rpldgtw*(a.day1<b.trade_date<=a.day61)) as rpldgtw_261, 
        sum(b.rpldgtw*(a.day_41<=b.trade_date<=a.day_11)) as rpldgtw_41_11,
        sum(b.rpldgtw_spread*(a.rdq1<=b.trade_date<=a.day1)) as rpldgtw_spread_01, 
        sum(b.rpldgtw_spread*(a.day1<b.trade_date<=a.day61)) as rpldgtw_spread_261, 
        sum(b.rpldgtw_spread*(a.day_41<=b.trade_date<=a.day_11)) as rpldgtw_spread_41_11,
        sum(b.rpldgtw_ret*(a.rdq1<=b.trade_date<=a.day1)) as rpldgtw_ret_01, 
        sum(b.rpldgtw_ret*(a.day1<b.trade_date<=a.day61)) as rpldgtw_ret_261, 
        sum(b.rpldgtw_ret*(a.day_41<=b.trade_date<=a.day_11)) as rpldgtw_ret_41_11,
        sum(b.rpldgtw_commission*(a.rdq1<=b.trade_date<=a.day1)) as rpldgtw_commission_01, 
        sum(b.rpldgtw_commission*(a.day1<b.trade_date<=a.day61)) as rpldgtw_commission_261, 
        sum(b.rpldgtw_commission*(a.day_41<=b.trade_date<=a.day_11)) as rpldgtw_commission_41_11

      from sample_base as a left join trade_return3 as b 
      on a.permno = b.permno and a.household_number = b.household_number
        and a.day_41<=b.trade_date<=a.day61 and not missing(b.trade_date)
      group by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number
      order by a.permno, a.rdq1, a.fyearq, a.fqtr, a.household_number; 
      quit;

  * Export; 

    PROC EXPORT DATA= WORK.sample_base
                    OUTFILE= "D:\Dropbox\SASLib\Thesis\Stata\odean_hhnew.dta" 
                    DBMS=STATA REPLACE;
              RUN;
